import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as snsThis post includes code and notes from this gist and this post.
import sqlalchemy as db
import sqlite3
import pandas as pd
import numpy as np%load_ext sqlengine = db.create_engine('sqlite:///database.sqlite')
connection = engine.connect()
metadata = db.MetaData()tables = pd.read_sql("""SELECT *
FROM sqlite_master
WHERE type='table';""", connection)
tables| type | name | tbl_name | rootpage | sql | |
|---|---|---|---|---|---|
| 0 | table | sqlite_sequence | sqlite_sequence | 4 | CREATE TABLE sqlite_sequence(name,seq) |
| 1 | table | Player_Attributes | Player_Attributes | 11 | CREATE TABLE "Player_Attributes" (\n\t`id`\tIN... |
| 2 | table | Player | Player | 14 | CREATE TABLE `Player` (\n\t`id`\tINTEGER PRIMA... |
| 3 | table | Match | Match | 18 | CREATE TABLE `Match` (\n\t`id`\tINTEGER PRIMAR... |
| 4 | table | League | League | 24 | CREATE TABLE `League` (\n\t`id`\tINTEGER PRIMA... |
| 5 | table | Country | Country | 26 | CREATE TABLE `Country` (\n\t`id`\tINTEGER PRIM... |
| 6 | table | Team | Team | 29 | CREATE TABLE "Team" (\n\t`id`\tINTEGER PRIMARY... |
| 7 | table | Team_Attributes | Team_Attributes | 2 | CREATE TABLE `Team_Attributes` (\n\t`id`\tINTE... |
| 8 | table | Match_df | Match_df | 3 | CREATE TABLE Match_df(\n id INT,\n country_n... |
| 9 | table | Match_Wins | Match_Wins | 308451 | CREATE TABLE Match_Wins(\n id INT,\n country... |
%%sql
SELECT *
FROM Match
LIMIT 3;Environment variable $DATABASE_URL not set, and no connect string given.
Connection info needed in SQLAlchemy format, example:
postgresql://username:password@hostname/dbname
or an existing connection: dict_keys([])
connection<sqlalchemy.engine.base.Connection at 0x7f785f788c50>
match_wins = pd.read_sql("""SELECT *
FROM Match_Wins;""", connection)# sql_query = %sql SELECT * FROM Match_Wins
# df = sql_query.DataFrame()
# dfmatch_wins| id | country_name | league_name | season | stage | date | home_team | away_team | home_team_goal | away_team_goal | home_team_win | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24559 | Switzerland | Switzerland Super League | 2008/2009 | 1 | 2008-07-18 00:00:00 | BSC Young Boys | FC Basel | 1 | 2 | 0 |
| 1 | 24560 | Switzerland | Switzerland Super League | 2008/2009 | 1 | 2008-07-19 00:00:00 | FC Aarau | FC Sion | 3 | 1 | 1 |
| 2 | 24561 | Switzerland | Switzerland Super League | 2008/2009 | 1 | 2008-07-20 00:00:00 | FC Luzern | FC Vaduz | 1 | 2 | 0 |
| 3 | 24562 | Switzerland | Switzerland Super League | 2008/2009 | 1 | 2008-07-20 00:00:00 | Neuchâtel Xamax | FC Zürich | 1 | 2 | 0 |
| 4 | 24613 | Switzerland | Switzerland Super League | 2008/2009 | 2 | 2008-07-23 00:00:00 | FC Basel | Grasshopper Club Zürich | 1 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25974 | 25945 | Switzerland | Switzerland Super League | 2015/2016 | 36 | 2016-05-25 00:00:00 | FC Basel | Grasshopper Club Zürich | 0 | 1 | 0 |
| 25975 | 25946 | Switzerland | Switzerland Super League | 2015/2016 | 36 | 2016-05-25 00:00:00 | Lugano | FC St. Gallen | 3 | 0 | 1 |
| 25976 | 25947 | Switzerland | Switzerland Super League | 2015/2016 | 36 | 2016-05-25 00:00:00 | FC Luzern | FC Sion | 2 | 2 | 0 |
| 25977 | 25948 | Switzerland | Switzerland Super League | 2015/2016 | 36 | 2016-05-25 00:00:00 | FC Thun | BSC Young Boys | 0 | 3 | 0 |
| 25978 | 25949 | Switzerland | Switzerland Super League | 2015/2016 | 36 | 2016-05-25 00:00:00 | FC Zürich | FC Vaduz | 3 | 1 | 1 |
25979 rows × 11 columns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
profile = ProfileReport(match_wins, title='Pandas Profiling Report')profile.to_widgets()
| id | country_name | league_name | season | stage | date | home_team | away_team | home_team_goal | away_team_goal | home_team_win | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24559 | Switzerland | Switzerland Super League | 2008/2009 | 1 | 2008-07-18 00:00:00 | BSC Young Boys | FC Basel | 1 | 2 | 0 |
| 1 | 24560 | Switzerland | Switzerland Super League | 2008/2009 | 1 | 2008-07-19 00:00:00 | FC Aarau | FC Sion | 3 | 1 | 1 |
| 2 | 24561 | Switzerland | Switzerland Super League | 2008/2009 | 1 | 2008-07-20 00:00:00 | FC Luzern | FC Vaduz | 1 | 2 | 0 |
| 3 | 24562 | Switzerland | Switzerland Super League | 2008/2009 | 1 | 2008-07-20 00:00:00 | Neuchâtel Xamax | FC Zürich | 1 | 2 | 0 |
| 4 | 24613 | Switzerland | Switzerland Super League | 2008/2009 | 2 | 2008-07-23 00:00:00 | FC Basel | Grasshopper Club Zürich | 1 | 0 | 1 |
| 5 | 24614 | Switzerland | Switzerland Super League | 2008/2009 | 2 | 2008-07-23 00:00:00 | AC Bellinzona | Neuchâtel Xamax | 1 | 2 | 0 |
| 6 | 24615 | Switzerland | Switzerland Super League | 2008/2009 | 2 | 2008-07-23 00:00:00 | FC Zürich | FC Luzern | 1 | 0 | 1 |
| 7 | 24616 | Switzerland | Switzerland Super League | 2008/2009 | 2 | 2008-07-24 00:00:00 | FC Sion | BSC Young Boys | 2 | 1 | 1 |
| 8 | 24617 | Switzerland | Switzerland Super League | 2008/2009 | 2 | 2008-07-24 00:00:00 | FC Vaduz | FC Aarau | 0 | 2 | 0 |
| 9 | 24668 | Switzerland | Switzerland Super League | 2008/2009 | 3 | 2008-07-26 00:00:00 | FC Basel | AC Bellinzona | 2 | 0 | 1 |
| id | country_name | league_name | season | stage | date | home_team | away_team | home_team_goal | away_team_goal | home_team_win | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 25969 | 25940 | Switzerland | Switzerland Super League | 2015/2016 | 35 | 2016-05-22 00:00:00 | Grasshopper Club Zürich | FC Thun | 0 | 0 | 0 |
| 25970 | 25941 | Switzerland | Switzerland Super League | 2015/2016 | 35 | 2016-05-22 00:00:00 | FC Sion | FC Zürich | 2 | 2 | 0 |
| 25971 | 25942 | Switzerland | Switzerland Super League | 2015/2016 | 35 | 2016-05-22 00:00:00 | FC Vaduz | Lugano | 0 | 0 | 0 |
| 25972 | 25943 | Switzerland | Switzerland Super League | 2015/2016 | 35 | 2016-05-22 00:00:00 | BSC Young Boys | FC Basel | 2 | 3 | 0 |
| 25973 | 25944 | Switzerland | Switzerland Super League | 2015/2016 | 35 | 2016-05-22 00:00:00 | FC St. Gallen | FC Luzern | 1 | 4 | 0 |
| 25974 | 25945 | Switzerland | Switzerland Super League | 2015/2016 | 36 | 2016-05-25 00:00:00 | FC Basel | Grasshopper Club Zürich | 0 | 1 | 0 |
| 25975 | 25946 | Switzerland | Switzerland Super League | 2015/2016 | 36 | 2016-05-25 00:00:00 | Lugano | FC St. Gallen | 3 | 0 | 1 |
| 25976 | 25947 | Switzerland | Switzerland Super League | 2015/2016 | 36 | 2016-05-25 00:00:00 | FC Luzern | FC Sion | 2 | 2 | 0 |
| 25977 | 25948 | Switzerland | Switzerland Super League | 2015/2016 | 36 | 2016-05-25 00:00:00 | FC Thun | BSC Young Boys | 0 | 3 | 0 |
| 25978 | 25949 | Switzerland | Switzerland Super League | 2015/2016 | 36 | 2016-05-25 00:00:00 | FC Zürich | FC Vaduz | 3 | 1 | 1 |
profile.to_notebook_iframe()
profile.to_file(output_file="pandas_profiling.html")
match_wins.head()| id | country_name | league_name | season | stage | date | home_team | away_team | home_team_goal | away_team_goal | home_team_win | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24559 | Switzerland | Switzerland Super League | 2008/2009 | 1 | 2008-07-18 00:00:00 | BSC Young Boys | FC Basel | 1 | 2 | 0 |
| 1 | 24560 | Switzerland | Switzerland Super League | 2008/2009 | 1 | 2008-07-19 00:00:00 | FC Aarau | FC Sion | 3 | 1 | 1 |
| 2 | 24561 | Switzerland | Switzerland Super League | 2008/2009 | 1 | 2008-07-20 00:00:00 | FC Luzern | FC Vaduz | 1 | 2 | 0 |
| 3 | 24562 | Switzerland | Switzerland Super League | 2008/2009 | 1 | 2008-07-20 00:00:00 | Neuchâtel Xamax | FC Zürich | 1 | 2 | 0 |
| 4 | 24613 | Switzerland | Switzerland Super League | 2008/2009 | 2 | 2008-07-23 00:00:00 | FC Basel | Grasshopper Club Zürich | 1 | 0 | 1 |
cols = match_wins.columns
colours = ['darkblue', 'red']
sns.heatmap(match_wins[cols].isnull(), cmap=sns.color_palette(colours))<AxesSubplot:>

# top = match_wins["home_team_win"].describe()['top'] # impute with the most frequent value.
# match_wins["home_team_win"] = match_wins["home_team_win"].fillna(top)pct_list = []
for col in match_wins.columns:
pct_missing = np.mean(match_wins[col].isnull())
if round(pct_missing*100) >0:
pct_list.append([col, round(pct_missing*100)])
print('{} - {}%'.format(col, round(pct_missing*100)))id - 0%
country_name - 0%
league_name - 0%
season - 0%
stage - 0%
date - 0%
home_team - 0%
away_team - 0%
home_team_goal - 0%
away_team_goal - 0%
home_team_win - 0%
match_wins.country_name0 Switzerland
1 Switzerland
2 Switzerland
3 Switzerland
4 Switzerland
...
25974 Switzerland
25975 Switzerland
25976 Switzerland
25977 Switzerland
25978 Switzerland
Name: country_name, Length: 25979, dtype: object
# # extracting the titles from the names:
# Title = []
# for name in match_wins.country_name:
# Title.append(name.split(",")[1].split(".")[0])
# match_wins["Team"] = Titlematch_wins.groupby(["home_team", 'season'])['home_team_win'].agg(['sum']).round(0)| sum | ||
|---|---|---|
| home_team | season | |
| 1. FC Kaiserslautern | 2010/2011 | 6 |
| 2011/2012 | 2 | |
| 1. FC Köln | 2008/2009 | 4 |
| 2009/2010 | 3 | |
| 2010/2011 | 11 | |
| ... | ... | ... |
| Śląsk Wrocław | 2011/2012 | 9 |
| 2012/2013 | 9 | |
| 2013/2014 | 5 | |
| 2014/2015 | 9 | |
| 2015/2016 | 5 |
1478 rows × 1 columns
df = df.drop(columns = ["Name"])
df = df.drop(columns = ["PassengerId"])
df = df.drop(columns = ["Ticket"])match_wins.dtypesid int64
country_name int8
league_name int8
season int8
stage int64
date int16
home_team int16
away_team int16
home_team_goal int64
away_team_goal int64
home_team_win int64
dtype: object
match_wins.country_name = pd.Categorical(match_wins.country_name)
match_wins.league_name = pd.Categorical(match_wins.league_name)
match_wins.season = pd.Categorical(match_wins.season)
match_wins.date = pd.Categorical(match_wins.date)match_wins["country_name"] = match_wins.country_name.cat.codesmatch_wins["league_name"] = match_wins.league_name.cat.codes
match_wins["season"] = match_wins.season.cat.codes
match_wins["date"] = match_wins.date.cat.codesmatch_wins.home_team = pd.Categorical(match_wins.home_team)match_wins.away_team = pd.Categorical(match_wins.away_team)match_wins["away_team"] = match_wins.away_team.cat.codesmatch_wins["home_team"] = match_wins.home_team.cat.codesmatch_wins["home_team"]0 24
1 72
2 84
3 173
4 76
...
25974 76
25975 160
25976 84
25977 95
25978 100
Name: home_team, Length: 25979, dtype: int16
match_wins.date = pd.Categorical(match_wins.date)match_wins["date"] = match_wins.date.cat.codesmatch_wins| id | country_name | league_name | season | stage | date | home_team | away_team | home_team_goal | away_team_goal | home_team_win | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24559 | 10 | 10 | 0 | 1 | 0 | 24 | 76 | 1 | 2 | 0 |
| 1 | 24560 | 10 | 10 | 0 | 1 | 1 | 72 | 91 | 3 | 1 | 1 |
| 2 | 24561 | 10 | 10 | 0 | 1 | 2 | 84 | 98 | 1 | 2 | 0 |
| 3 | 24562 | 10 | 10 | 0 | 1 | 2 | 173 | 100 | 1 | 2 | 0 |
| 4 | 24613 | 10 | 10 | 0 | 2 | 3 | 76 | 117 | 1 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25974 | 25945 | 10 | 10 | 7 | 36 | 1693 | 76 | 117 | 0 | 1 | 0 |
| 25975 | 25946 | 10 | 10 | 7 | 36 | 1693 | 160 | 93 | 3 | 0 | 1 |
| 25976 | 25947 | 10 | 10 | 7 | 36 | 1693 | 84 | 91 | 2 | 2 | 0 |
| 25977 | 25948 | 10 | 10 | 7 | 36 | 1693 | 95 | 24 | 0 | 3 | 0 |
| 25978 | 25949 | 10 | 10 | 7 | 36 | 1693 | 100 | 98 | 3 | 1 | 1 |
25979 rows × 11 columns
match_wins.dtypesid int64
country_name int8
league_name int8
season int8
stage int64
date int16
home_team int16
away_team int16
home_team_goal int64
away_team_goal int64
home_team_win int64
dtype: object
#match_wins = match_wins.drop(columns = ["Title"])
target = match_wins.home_team_win.values
match_wins = match_wins.drop(columns =["home_team_win"])match_wins| id | country_name | league_name | season | stage | date | home_team | away_team | home_team_goal | away_team_goal | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24559 | 10 | 10 | 0 | 1 | 0 | 24 | 76 | 1 | 2 |
| 1 | 24560 | 10 | 10 | 0 | 1 | 1 | 72 | 91 | 3 | 1 |
| 2 | 24561 | 10 | 10 | 0 | 1 | 2 | 84 | 98 | 1 | 2 |
| 3 | 24562 | 10 | 10 | 0 | 1 | 2 | 173 | 100 | 1 | 2 |
| 4 | 24613 | 10 | 10 | 0 | 2 | 3 | 76 | 117 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25974 | 25945 | 10 | 10 | 7 | 36 | 1693 | 76 | 117 | 0 | 1 |
| 25975 | 25946 | 10 | 10 | 7 | 36 | 1693 | 160 | 93 | 3 | 0 |
| 25976 | 25947 | 10 | 10 | 7 | 36 | 1693 | 84 | 91 | 2 | 2 |
| 25977 | 25948 | 10 | 10 | 7 | 36 | 1693 | 95 | 24 | 0 | 3 |
| 25978 | 25949 | 10 | 10 | 7 | 36 | 1693 | 100 | 98 | 3 | 1 |
25979 rows × 10 columns
targetarray([0, 1, 0, ..., 0, 0, 1])
from sklearn.model_selection import train_test_splitx_train, x_test, y_train, y_test = train_test_split(match_wins, target, test_size=0.2, random_state=0)from sklearn.linear_model import LogisticRegressionLR = LogisticRegression()
LR.fit(x_train, y_train)/home/gao/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
LogisticRegression()
LR.score(x_test, y_test)0.9736335642802155
import shap
explainer = shap.LinearExplainer(LR, x_train, feature_perturbation="interventional")
shap_values = explainer.shap_values(x_test)
shap.summary_plot(shap_values, x_test)
shap.dependence_plot("home_team", shap_values, x_test)
shap.summary_plot(shap_values, x_train, plot_type="bar")
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values, x_test, link="logit")shap.plots.force is slow for many thousands of rows, try subsampling your data.
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0,:], x_test.iloc[0,:], link="logit")
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[3,:], x_test.iloc[3,:], link="logit")
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.